{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://huggingface.co/datasets/mesolitica/chatgpt-c.cari.com.my-noisy-translation/resolve/main/filtered.jsonl -O c.cari.com.my.jsonl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[\"\\u56de\\u590d 1# jeffreylee  \\u53ef\\u4ee5\\u7528\\u4e00\\u9910 value meal \\u4f86\\u63db\\u55ce\\uff1f\", {\"english\": \"Can I exchange it with a value meal?\", \"malay\": \"Bolehkah saya menukar ini dengan hidangan nilai?\"}]\r\n",
      "[\" xuesha\\u7684\\u8138\\u597d\\u957f\\u554a\\u3002\\u3002\\u3002  10507975_338882742936189_1743878600_n.jpg (43.39 KB, \\u4e0b\\u8f7d\\u6b21\\u6570: 0)  \\u4e0b\\u8f7d\\u9644\\u4ef6  \\u4fdd\\u5b58\\u5230\\u76f8\\u518c  19-10-2015 01:13 PM \\u4e0a\\u4f20\", {\"english\": \"Xuesha's face is so long...\", \"malay\": \"Muka Xuesha sangat panjang...\"}]\r\n",
      "[\"haha~~  \\u6211\\u6765\\u4e5f\\uff01  \\u53ea\\u53ef\\u60dc\\u6211\\u6ca1\\u6709\\u5174\\u8da3~~\", {\"english\": \"Haha~~ I'm here too! Unfortunately, I'm not interested~~\", \"malay\": \"Haha~~ Saya juga ada di sini! Malangnya, saya tidak berminat~~\"}]\r\n"
     ]
    }
   ],
   "source": [
    "!head -n 3 c.cari.com.my.jsonl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "data = []\n",
    "with open('c.cari.com.my.jsonl') as fopen:\n",
    "    for l in fopen:\n",
    "        data.append(json.loads(l))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "744371"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "def simple(string):\n",
    "    string = re.sub('[^A-Za-z ]+', ' ', string)\n",
    "    string = re.sub(r'[ ]+', ' ', string).strip()\n",
    "    return string.lower()\n",
    "\n",
    "def reject(k):\n",
    "    if 'saya tidak dapat menterjemah teks' in k.lower():\n",
    "        return\n",
    "    if 'saya tidak boleh menterjemah kandungan yang tidak sesuai' in k.lower():\n",
    "        return\n",
    "    if 'saya tidak boleh menterjemah teks' in k.lower():\n",
    "        return\n",
    "    if 'teks yang diberikan tidak mempunyai makna' in k.lower():\n",
    "        return\n",
    "    if 'teks yang disediakan tidak boleh diterjemahkan' in k.lower():\n",
    "        return\n",
    "    if 'teks yang diberikan tidak masuk akal' in k.lower():\n",
    "        return\n",
    "    if 'teks yang diberikan nampaknya tidak' in k.lower():\n",
    "        return\n",
    "    if 'teks yang diberikan tidak mempunyai maksud' in k.lower():\n",
    "        return\n",
    "    if 'teks yang diberikan bukan dalam mana-mana bahasa' in k.lower():\n",
    "        return\n",
    "    if 'teks ini tidak boleh diterjemahkan' in k.lower():\n",
    "        return\n",
    "    if 'saya tidak dapat menterjemah frasa' in k.lower():\n",
    "        return\n",
    "    if 'teks yang disediakan bukan' in k.lower():\n",
    "        return\n",
    "    if 'teks yang diberikan tidak jelas' in k.lower():\n",
    "        return\n",
    "    if 'teks yang diberikan tidak' in k.lower():\n",
    "        return\n",
    "    if 'terjemahan teks kepada' in k.lower():\n",
    "        return\n",
    "    if 'saya tidak boleh menterjemah bahasa' in k.lower():\n",
    "        return\n",
    "    if 'model bahasa AI' in k:\n",
    "        return\n",
    "    if 'bahasa melayu standard' in k.lower():\n",
    "        return\n",
    "    if 'return JSON structure' in k:\n",
    "        return\n",
    "    if 'teks yang diberikan bukan dalam bahasa' in k.lower():\n",
    "        return\n",
    "    if 'teks yang anda berikan bukan' in k.lower():\n",
    "        return\n",
    "    if 'teks yang disediakan tidak' in k.lower():\n",
    "        return\n",
    "    if 'teks yang diberikan bukan' in k.lower():\n",
    "        return\n",
    "    if 'saya tidak dapat menterjemah' in k.lower():\n",
    "        return\n",
    "    if 'tetapi teks yang diberikan' in k.lower():\n",
    "        return\n",
    "    \n",
    "    return True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 744371/744371 [00:18<00:00, 39552.47it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "filtered = []\n",
    "for d in tqdm(data):\n",
    "    if d is None:\n",
    "        continue\n",
    "    if 'src' in d:\n",
    "        left = d['src']['text']\n",
    "        try:\n",
    "            en = d['r']['english']\n",
    "            ms = d['r']['malay']\n",
    "        except:\n",
    "            en = None\n",
    "            ms = None\n",
    "    else:\n",
    "        left = d[0]\n",
    "        success = False\n",
    "        try:\n",
    "            r = json.loads(d[1])\n",
    "            success = True\n",
    "        except:\n",
    "            pass\n",
    "        \n",
    "        if not success:\n",
    "            try:\n",
    "                r = eval(d[1])\n",
    "                success = True\n",
    "            except:\n",
    "                pass\n",
    "            \n",
    "        if not success and type(d[1]) == dict:\n",
    "            r = d[1]\n",
    "            success = True\n",
    "        \n",
    "        en = None\n",
    "        ms = None\n",
    "        \n",
    "        try:\n",
    "            if success:\n",
    "                if isinstance(r, tuple):\n",
    "                    en = [r_['english'] for r_ in r]\n",
    "                    ms = [r_['malay'] for r_ in r]\n",
    "                    en = ' '.join(en)\n",
    "                    ms = ' '.join(ms)\n",
    "                else:\n",
    "                    en = r['english']\n",
    "                    ms = r['malay']\n",
    "        except:\n",
    "            pass\n",
    "        \n",
    "    if en and not type(en) == str:\n",
    "        continue\n",
    "        \n",
    "    simple_left = simple(left)\n",
    "    \n",
    "    if (en and simple(en) == simple_left) or (ms and simple(ms) == simple_left):\n",
    "        continue\n",
    "    \n",
    "    if en or ms:\n",
    "        filtered.append({\n",
    "            'left': left,\n",
    "            'en': en,\n",
    "            'ms': ms\n",
    "        })           "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[' xuesha的脸好长啊。。。  10507975_338882742936189_1743878600_n.jpg (43.39 KB, 下载次数: 0)  下载附件  保存到相册  19-10-2015 01:13 PM 上传',\n",
       " {'english': \"Xuesha's face is so long...\",\n",
       "  'malay': 'Muka Xuesha sangat panjang...'}]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "740826"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(filtered)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(740826, 744371)"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(filtered), len(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('processed-c.cari.com.my.jsonl', 'w') as fopen:\n",
    "    for d in filtered:\n",
    "        try:\n",
    "            if not reject(d['ms']):\n",
    "                continue\n",
    "            fopen.write(f'{json.dumps(d)}\\n')\n",
    "        except:\n",
    "            pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "740431 processed-c.cari.com.my.jsonl\r\n"
     ]
    }
   ],
   "source": [
    "!wc -l processed-c.cari.com.my.jsonl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\"left\": \"\\u56de\\u590d 1# jeffreylee  \\u53ef\\u4ee5\\u7528\\u4e00\\u9910 value meal \\u4f86\\u63db\\u55ce\\uff1f\", \"en\": \"Can I exchange it with a value meal?\", \"ms\": \"Bolehkah saya menukar ini dengan hidangan nilai?\"}\r\n",
      "{\"left\": \" xuesha\\u7684\\u8138\\u597d\\u957f\\u554a\\u3002\\u3002\\u3002  10507975_338882742936189_1743878600_n.jpg (43.39 KB, \\u4e0b\\u8f7d\\u6b21\\u6570: 0)  \\u4e0b\\u8f7d\\u9644\\u4ef6  \\u4fdd\\u5b58\\u5230\\u76f8\\u518c  19-10-2015 01:13 PM \\u4e0a\\u4f20\", \"en\": \"Xuesha's face is so long...\", \"ms\": \"Muka Xuesha sangat panjang...\"}\r\n",
      "{\"left\": \"haha~~  \\u6211\\u6765\\u4e5f\\uff01  \\u53ea\\u53ef\\u60dc\\u6211\\u6ca1\\u6709\\u5174\\u8da3~~\", \"en\": \"Haha~~ I'm here too! Unfortunately, I'm not interested~~\", \"ms\": \"Haha~~ Saya juga ada di sini! Malangnya, saya tidak berminat~~\"}\r\n",
      "{\"left\": \"\\u771f\\u7684\\u5f88\\u611f\\u6fc0\\u4f60\\u3002\\u4f60\\u4e5f\\u662f\\u4f4fCheras\\u5417\\uff1f\\u8981\\u4e00\\u8d77\\u53bb\\u5417\\uff1f  \\u6211\\u6709\\u6536\\u5230\\u4f60\\u7684\\u597d\\u53cb\\u9080\\u8bf7\\uff0c\\u53ef\\u662f\\u6211\\u4e0d\\u4f1aaccept\\u54e6\\u3002\", \"en\": \"I'm really grateful to you. Do you also live in Cheras? Do you want to go together? I received your friend request, but I won't accept it.\", \"ms\": \"Saya sangat berterima kasih kepada anda. Adakah anda juga tinggal di Cheras? Nak pergi bersama-sama? Saya telah menerima jemputan kawan anda, tetapi saya tidak akan menerimanya.\"}\r\n",
      "{\"left\": \"when sperm say:\\\"Oh, shit!!\\\"\\n38bearbear \\u53d1\\u8868\\u4e8e 15-12-2010 10:18 PM  \\u5173sperm\\u751a\\u4e48\\u4e8b\\uff1f  \\u4f60\\u672a\\u514d\\u63a5\\u5f97\\u592a\\u52c9\\u5f3a\\u4e86\\u3002\\u3002\\u3002\", \"en\": \"When sperm say, 'Oh, shit!!', what happened? You're forcing it too much...\", \"ms\": \"Apabila sperma berkata, 'Oh, sial!!', apa yang terjadi? Anda terlalu memaksa...\"}\r\n",
      "{\"left\": \"\\u51b2\\u54e5\\u662f\\u82b1\\u4e86\\u54b1\\u4eec\\u516d\\u5341\\u5927\\u94f6\\u5440.\\u4e0d\\u662f\\u4e94\\u5341\\u53fb\\nkawamx2 \\u53d1\\u8868\\u4e8e 7-5-2012 07:54 PM  \\u4e0b\\u6b21\\u8fc7\\u53bb\\u7684\\u8bdd\\uff0c\\u5e262\\u4e2a\\u7ed9\\u6211\\u561b\\uff0c\\u4e0d\\u7136\\u770b\\u8c01\\u8fd8\\u8981\\uff5e\\u56e2\\u8d2d\\uff0c\\u4e00\\u6b21\\u4e70\\u4e00\\u6253\", \"en\": \"Brother Chong spent sixty big bucks on us, not fifty.\", \"ms\": \"Brother Chong telah menghabiskan enam puluh ringgit pada kita, bukan lima puluh.\"}\r\n",
      "{\"left\": \"\\u5403\\u7684\\u662f\\u6cdb\\u6ee5\\u7684\\u91ce\\u72d7\\uff0c\\u4e0d\\u662f\\u6fd2\\u5371\\u751f\\u7269  \\u4e0d\\u8981\\u5403\\u5ba0\\u7269\\u3002 \\u5f88\\u591a\\u72d7\\u5e02\\u573a\\u7684\\u72d7\\u662f\\u72d7\\u8d29\\u5b50\\u53bb\\u5077\\u4eba\\u5bb6\\u7684\\u72d7\\u6765\\u5356\\u7684\\u3002\", \"en\": \"Eating is wild dogs, not endangered species. Do not eat pets. Many dogs in the market are stolen by dog traffickers and sold.\", \"ms\": \"Makanan adalah anjing liar, bukan spesies terancam. Jangan makan haiwan peliharaan. Banyak anjing di pasaran dicuri oleh penjual anjing dan dijual.\"}\r\n",
      "{\"left\": \"\\u3002\\u3002\\u3002\\u3002\\u3002\\u3002\\u3002\\u3002\\u3002\\u3002\\u3002\\u3002\\u3002\\u3002\\u3002\\u3002\\u3002  [ \\u672c\\u5e16\\u6700\\u540e\\u7531 albertlim832 \\u4e8e 16-11-2009 11:49 PM \\u7f16\\u8f91 ]\", \"en\": \".......... [ Last edited by albertlim832 on 16-11-2009 at 11:49 PM ]\", \"ms\": \".......... [ Terakhir diedit oleh albertlim832 pada 16-11-2009 pada 11:49 PM ]\"}\r\n",
      "{\"left\": \"\\u5982\\u679c\\u6709\\u80fd\\u529b\\u5c31\\u81ea\\u5df1settle loan, \\u8981\\u4e0d\\u7136\\u5f88\\u591a\\u624b\\u5c3e\\u3002\\u3002\\u3002\", \"en\": \"If you have the ability, settle the loan yourself, otherwise there will be many complications.\", \"ms\": \"Jika anda mempunyai keupayaan, selesaikan pinjaman sendiri, jika tidak, akan ada banyak kesulitan.\"}\r\n",
      "{\"left\": \"\\u5982\\u679c\\u662f\\u90a3\\u79cd\\u5bb6\\u5ead\\u6f66\\u5012\\u5230\\u4e0d\\u884c\\u7684\\uff0c\\u771f\\u7684\\u4f1a\\u4e0d\\u5fcd\\u5fc3\\u54af\\u3002  \\u5e94\\u8be5\\u4e0d\\u81f3\\u4e8e\", \"en\": \"If it's the kind of family that is extremely poor, I really can't bear it. It shouldn't be like that.\", \"ms\": \"Jika itu adalah jenis keluarga yang sangat miskin, saya benar-benar tidak tahan. Seharusnya tidak begitu.\"}\r\n"
     ]
    }
   ],
   "source": [
    "!head -n 10 processed-c.cari.com.my.jsonl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
